{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from random import random\n",
    "\n",
    "import pandas as pd\n",
    "from selenium.webdriver.common.by import By"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-3-0bf37ba220c1>:12: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "\n",
    "# Configure and launch the Chrome session that every later cell drives.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # avoids the 'DevToolsActivePort file doesn't exist' startup error\n",
    "opts.add_argument('--window-size=1920,3000')  # browser resolution; Chrome expects width,height\n",
    "opts.add_argument('--disable-gpu')  # flag the Google docs recommend to work around a bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, needed for some special pages\n",
    "\n",
    "\n",
    "# `chrome_options=` is deprecated; `options=` is the supported keyword.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CNKI home page in the driver's current window.\n",
    "driver.get(\"https://cnki.net/\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 检查是否登录"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the fourth div inside the page header box\n",
    "# (presumably the login entry, per the section heading - TODO confirm).\n",
    "# find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath.\n",
    "driver.find_element(By.XPATH, '//*[@id=\"headerBox\"]/div[1]/div/div/div[4]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Log in via IP authentication (campus network) by clicking the element with id 'Button2'.\n",
    "# find_element(By.ID, ...) replaces the deprecated find_element_by_id.\n",
    "element = driver.find_element(By.ID, 'Button2')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the inner HTML of the 'Ecp_loginShowName1' element as the cell output\n",
    "# (presumably the logged-in account name - TODO confirm).\n",
    "element = driver.find_element(By.ID, 'Ecp_loginShowName1')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 点击高级检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the advanced-search entry (id 'highSearch').\n",
    "# The former mid-cell get_attribute('innerHTML') call was dropped: its result was discarded.\n",
    "element = driver.find_element(By.ID, 'highSearch')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 检查窗口位置\n",
    "* 当出现多个窗口时，一定要先检查窗口位置\n",
    "* 每一个窗口在driver中自动生成唯一的窗口id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-02C6B60B795A8EA53531322C910B05D7'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Handle of the window the driver is currently focused on.\n",
    "driver.current_window_handle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-02C6B60B795A8EA53531322C910B05D7',\n",
       " 'CDwindow-DD1A0A4C932E87C22B9ACD11A505260D']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Handles of all windows currently known to the driver.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-8-0321d37a4ccc>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# Move the driver's focus to the second window in the handle list.\n",
    "# switch_to.window replaces the deprecated driver.switch_to_window.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-DD1A0A4C932E87C22B9ACD11A505260D'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Confirm which window the driver is focused on after the switch.\n",
    "driver.current_window_handle"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 点击期刊检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the document-type menu entry whose data-id is 'xsqk' (journal search, per the section heading).\n",
    "# The former mid-cell get_attribute('innerHTML') call was dropped: its result was discarded.\n",
    "element = driver.find_element(By.XPATH, '//ul[@class=\"doctype-menus keji\"]/li[@data-id=\"xsqk\"]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 勾选期刊类型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tick the SCI journal-source checkbox (input with key 'SI').\n",
    "element = driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"SI\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tick the EI journal-source checkbox (input with key 'EI').\n",
    "element = driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"EI\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tick the Peking University core-journal checkbox (input with key 'HX').\n",
    "element = driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"HX\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tick the CSSCI journal-source checkbox (input with key 'CSI').\n",
    "element = driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"CSI\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tick the CSCD journal-source checkbox (input with key 'CSD').\n",
    "element = driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"CSD\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 填写query\n",
    "* 可以在高级检索直接检索（只需要不精确查找）\n",
    "* 建议 专业检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch to professional search by clicking the element named 'majorSearch'.\n",
    "# find_element(By.NAME, ...) replaces the deprecated find_element_by_name.\n",
    "driver.find_element(By.NAME, 'majorSearch').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'SU=主题,TKA=篇关摘,TI=篇名,KY=关键词,AB=摘要,CO=小标题,FT=全文,AU=作者,FI=第一作者,RP=通讯作者,AF=作者单位,LY=期刊名称,RF=参考文献,FU=基金,             CLC=中图分类号,     SN=ISSN,CN=CN,   DOI=DOI,QKLM=栏目信息,FAF=第一单位,CF=被引频次'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the help paragraph listing the searchable field codes (SU, TI, KY, ...) as the cell output.\n",
    "# NOTE: the absolute XPath depends on the exact page layout and will break if the page structure changes.\n",
    "driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[2]/dl/dd[1]/p').get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search expression in the field-code syntax listed above:\n",
    "# subject (SU) matches one term AND title (TI) matches any of three terms.\n",
    "query = 'SU=\"新媒体\" AND (TI=\"人工智能\" OR TI=\"AI\" OR TI=\"大数据\")'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the contents of the first <textarea> on the page with the query expression.\n",
    "element = driver.find_element(By.XPATH, '//textarea')\n",
    "element.clear()\n",
    "element.send_keys(query)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 点击检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Submit the search by clicking the input whose value attribute is the search-button label.\n",
    "element = driver.find_element(By.XPATH, '//input[@value=\"检索\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the display (results-per-page) control with id 'perPageDiv'.\n",
    "element = driver.find_element(By.ID, 'perPageDiv')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose the option whose data-val is 50 (50 results per page).\n",
    "element = driver.find_element(By.XPATH, '//li[@data-val=\"50\"]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1/21'"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the page indicator text (formatted as current/total, e.g. '1/21') as the cell output.\n",
    "element = driver.find_element(By.XPATH, '//span[@class=\"countPageMark\"]')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取页面内容"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>大数据时代思想政治教育“微”透视</td>\n",
       "      <td>柳海燕</td>\n",
       "      <td>中学政治教学参考</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>63.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>大数据背景下传统媒体突围策略分析</td>\n",
       "      <td>于佳</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>54.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>探讨大数据下的广电新闻编辑发展创新</td>\n",
       "      <td>陈媛媛</td>\n",
       "      <td>记者摇篮</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>大数据与新媒体时代医学期刊的办刊之路</td>\n",
       "      <td>焦骞; 刘卓; 董军杰; 张爱净</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>大数据时代与校园文化的多元性</td>\n",
       "      <td>翟屿潼</td>\n",
       "      <td>学理论</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>107.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>“人机协同”在全国两会现场报道中的实践——以人民日报新媒体5G+AI报道为例</td>\n",
       "      <td>巩晗</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-04-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>56.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>创新传媒教育模式 培养现代传媒人才——评《大数据时代传媒教育研究》</td>\n",
       "      <td>郭慧; 陈红梅; 阎瑞华</td>\n",
       "      <td>山西财经大学学报</td>\n",
       "      <td>2021-04-26 10:35</td>\n",
       "      <td>NaN</td>\n",
       "      <td>68.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>基于人工智能技术的新媒体交互艺术表达设计</td>\n",
       "      <td>许洋洋</td>\n",
       "      <td>自动化技术与应用</td>\n",
       "      <td>2021-04-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>107.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>新媒体时代人工智能作品的著作权法地位探讨——评《著作权法前沿热点问题探究》</td>\n",
       "      <td>杜菁</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-04-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>164.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>播音主持应对人工智能的策略与思考</td>\n",
       "      <td>胡未央</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>36.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>大数据背景下媒体融合发展趋势探讨</td>\n",
       "      <td>侯玉娟</td>\n",
       "      <td>广播电视信息</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>96.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>5G新媒体平台大数据系统运维体系的建设</td>\n",
       "      <td>芦丽丽</td>\n",
       "      <td>现代电视技术</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>基于大数据的电视节目评价体系构建研究</td>\n",
       "      <td>宋凯; 庞雪芮</td>\n",
       "      <td>湖南工业大学学报(社会科学版)</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>浅析大数据时代传统媒体与新媒体的融合</td>\n",
       "      <td>薛锦瑜</td>\n",
       "      <td>记者摇篮</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>145.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>人工智能与新媒体传播双重视域下高校美育实践的改革创新</td>\n",
       "      <td>张建; 高尚</td>\n",
       "      <td>绵阳师范学院学报</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>93.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>探析大数据下传统媒体与新媒体融合发展路径</td>\n",
       "      <td>林珺</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-04-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>144.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>大数据环境下读者阅读行为转变与可视化分析——基于Cite Space</td>\n",
       "      <td>孔洁</td>\n",
       "      <td>兰台世界</td>\n",
       "      <td>2021-04-06</td>\n",
       "      <td>NaN</td>\n",
       "      <td>28.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>Research on Enterprise Human Resource Manageme...</td>\n",
       "      <td>Wang Fang</td>\n",
       "      <td>Journal of Physics: Conference Series</td>\n",
       "      <td>2021-04-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>大数据背景下高校图书馆服务创新与发展的研究</td>\n",
       "      <td>朱茂富; 孙琳; 高国瑞</td>\n",
       "      <td>内蒙古科技与经济</td>\n",
       "      <td>2021-03-31</td>\n",
       "      <td>NaN</td>\n",
       "      <td>18.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>大数据时代高校党建工作创新路径研究</td>\n",
       "      <td>黄璞; 李岩</td>\n",
       "      <td>办公室业务</td>\n",
       "      <td>2021-03-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>67.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>大数据视域下新媒体环境中地方高校档案管理和公共服务能力建设</td>\n",
       "      <td>郭晓文</td>\n",
       "      <td>赤峰学院学报(自然科学版)</td>\n",
       "      <td>2021-03-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>大数据时代新媒介视觉艺术现状与价值</td>\n",
       "      <td>涂玉洁</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-03-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>42.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>互联网新媒体时代下大数据营销中的伦理建设探析</td>\n",
       "      <td>吕颖迪; 于孟晨</td>\n",
       "      <td>商业文化</td>\n",
       "      <td>2021-03-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>Analysis of The Impact of New Media Tools on C...</td>\n",
       "      <td>Xin Zhou</td>\n",
       "      <td>International Journal of Education and Teachin...</td>\n",
       "      <td>2021-03-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>人工智能主播的应用策略</td>\n",
       "      <td>王梦颖; 李怀苍</td>\n",
       "      <td>宁夏师范学院学报</td>\n",
       "      <td>2021-03-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>55.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>智能导播助力2021春晚新媒体节目创新——浅析人工智能切换技术的应用</td>\n",
       "      <td>陈戈</td>\n",
       "      <td>现代电视技术</td>\n",
       "      <td>2021-03-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>40.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>New Media User Behaviour Research Based on Big...</td>\n",
       "      <td>Zhu Zhixuan</td>\n",
       "      <td>Journal of Physics: Conference Series</td>\n",
       "      <td>2021-03-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>新媒体大数据下大学生党建与思政互构性研究</td>\n",
       "      <td>张正; 杨会朴</td>\n",
       "      <td>文化产业</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>97.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>大数据时代背景下高校思想政治教育创新研究</td>\n",
       "      <td>陈坤; 李佳</td>\n",
       "      <td>思想政治教育研究</td>\n",
       "      <td>2021-02-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>420.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>新媒体背景下专业教材出版思路研究——基于人工智能专业教材出版实践</td>\n",
       "      <td>祝智敏; 李晓雨; 吴振宇</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>AI和大数据技术对新媒体传播的影响及应用分析</td>\n",
       "      <td>田新梅</td>\n",
       "      <td>中国有线电视</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>92.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>32</td>\n",
       "      <td>大数据时代英语翻译教学新模式的建构——评《大数据时代云端翻转课堂模式下的口译教学探索》</td>\n",
       "      <td>王大维</td>\n",
       "      <td>中国科技论文</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>103.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>以大数据推进全媒体时代高校思想政治教育——评《新媒体时代高校思想政治教育模式探究》</td>\n",
       "      <td>王谦</td>\n",
       "      <td>中国科技论文</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>139.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>34</td>\n",
       "      <td>基于大数据的消费文化语境下自媒体剽窃等侵权现象研究</td>\n",
       "      <td>陆璐</td>\n",
       "      <td>滁州学院学报</td>\n",
       "      <td>2021-02-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>36.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>大数据时代英语媒体的发展战略探析</td>\n",
       "      <td>张敏</td>\n",
       "      <td>新闻研究导刊</td>\n",
       "      <td>2021-02-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>基于人工智能的传媒企业发展探析</td>\n",
       "      <td>孙芳</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-02-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>130.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>37</td>\n",
       "      <td>大数据背景下传统媒体与新媒体融合路径探析</td>\n",
       "      <td>杨文惠</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-02-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>138.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>38</td>\n",
       "      <td>大数据背景下传统媒体与新媒体融合发展战略</td>\n",
       "      <td>黄猛猛; 黄瑶</td>\n",
       "      <td>西部广播电视</td>\n",
       "      <td>2021-02-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>39</td>\n",
       "      <td>大数据时代高校突发事件网络舆情引导机制研究</td>\n",
       "      <td>陈娟; 康秀平; 许莹莹</td>\n",
       "      <td>声屏世界</td>\n",
       "      <td>2021-02-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>40</td>\n",
       "      <td>大数据背景下的农业农村新闻创作导向分析</td>\n",
       "      <td>唐雪莲</td>\n",
       "      <td>声屏世界</td>\n",
       "      <td>2021-02-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>41</td>\n",
       "      <td>大数据技术对新闻传播领域的影响分析</td>\n",
       "      <td>李朝敏</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-01-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>151.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>42</td>\n",
       "      <td>《科技日报》人工智能报道研究</td>\n",
       "      <td>赖晨璐; 陶贤都</td>\n",
       "      <td>科技传播</td>\n",
       "      <td>2021-01-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>43</td>\n",
       "      <td>人工智能媒介下文学创作商业走向分析</td>\n",
       "      <td>李娜</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-01-22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>79.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>浅谈大数据时代新媒体与传统媒体间的冲击及融合路径</td>\n",
       "      <td>严伊琳</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>2.0</td>\n",
       "      <td>76.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>45</td>\n",
       "      <td>新媒体时代大数据技术对高校教育管理的影响——评《大数据时代高等教育规范化管理研究》</td>\n",
       "      <td>王晶</td>\n",
       "      <td>中国科技论文</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>86.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>46</td>\n",
       "      <td>大数据语境下教育者话语权的重构</td>\n",
       "      <td>陈曦; 张晓世; 王雁冰</td>\n",
       "      <td>教育理论与实践</td>\n",
       "      <td>2021-01-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>54.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>47</td>\n",
       "      <td>大数据技术下新媒体用户画像与隐私安全</td>\n",
       "      <td>曹秦雨</td>\n",
       "      <td>新闻研究导刊</td>\n",
       "      <td>2020-12-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>682.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>浅谈人工智能在新闻生产领域的应用——以中央广播电视总台为例</td>\n",
       "      <td>罗娜</td>\n",
       "      <td>新闻战线</td>\n",
       "      <td>2020-12-23</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>49</td>\n",
       "      <td>基于大数据的广告公司新媒体营销策略研究</td>\n",
       "      <td>郭慧馨; 葛健; 孟凡哲</td>\n",
       "      <td>商业经济</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>617.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>50</td>\n",
       "      <td>疫情下国际性会议融媒体传播机制——以2020世界人工智能大会云端峰会为例</td>\n",
       "      <td>陈实</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>83.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                                 篇名  \\\n",
       "0            1                                   大数据时代思想政治教育“微”透视   \n",
       "1            2                                   大数据背景下传统媒体突围策略分析   \n",
       "2            3                                  探讨大数据下的广电新闻编辑发展创新   \n",
       "3            4                                 大数据与新媒体时代医学期刊的办刊之路   \n",
       "4            5                                     大数据时代与校园文化的多元性   \n",
       "5            6             “人机协同”在全国两会现场报道中的实践——以人民日报新媒体5G+AI报道为例   \n",
       "6            7                  创新传媒教育模式 培养现代传媒人才——评《大数据时代传媒教育研究》   \n",
       "7            8                               基于人工智能技术的新媒体交互艺术表达设计   \n",
       "8            9              新媒体时代人工智能作品的著作权法地位探讨——评《著作权法前沿热点问题探究》   \n",
       "9           10                                   播音主持应对人工智能的策略与思考   \n",
       "10          11                                   大数据背景下媒体融合发展趋势探讨   \n",
       "11          12                                5G新媒体平台大数据系统运维体系的建设   \n",
       "12          13                                 基于大数据的电视节目评价体系构建研究   \n",
       "13          14                                 浅析大数据时代传统媒体与新媒体的融合   \n",
       "14          15                         人工智能与新媒体传播双重视域下高校美育实践的改革创新   \n",
       "15          16                               探析大数据下传统媒体与新媒体融合发展路径   \n",
       "16          17                 大数据环境下读者阅读行为转变与可视化分析——基于Cite Space   \n",
       "17          18  Research on Enterprise Human Resource Manageme...   \n",
       "18          19                              大数据背景下高校图书馆服务创新与发展的研究   \n",
       "19          20                                  大数据时代高校党建工作创新路径研究   \n",
       "20          21                      大数据视域下新媒体环境中地方高校档案管理和公共服务能力建设   \n",
       "21          22                                  大数据时代新媒介视觉艺术现状与价值   \n",
       "22          23                             互联网新媒体时代下大数据营销中的伦理建设探析   \n",
       "23          24  Analysis of The Impact of New Media Tools on C...   \n",
       "24          25                                        人工智能主播的应用策略   \n",
       "25          26                 智能导播助力2021春晚新媒体节目创新——浅析人工智能切换技术的应用   \n",
       "26          27  New Media User Behaviour Research Based on Big...   \n",
       "27          28                               新媒体大数据下大学生党建与思政互构性研究   \n",
       "28          29                               大数据时代背景下高校思想政治教育创新研究   \n",
       "29          30                   新媒体背景下专业教材出版思路研究——基于人工智能专业教材出版实践   \n",
       "30          31                             AI和大数据技术对新媒体传播的影响及应用分析   \n",
       "31          32        大数据时代英语翻译教学新模式的建构——评《大数据时代云端翻转课堂模式下的口译教学探索》   \n",
       "32          33          以大数据推进全媒体时代高校思想政治教育——评《新媒体时代高校思想政治教育模式探究》   \n",
       "33          34                          基于大数据的消费文化语境下自媒体剽窃等侵权现象研究   \n",
       "34          35                                   大数据时代英语媒体的发展战略探析   \n",
       "35          36                                    基于人工智能的传媒企业发展探析   \n",
       "36          37                               大数据背景下传统媒体与新媒体融合路径探析   \n",
       "37          38                               大数据背景下传统媒体与新媒体融合发展战略   \n",
       "38          39                              大数据时代高校突发事件网络舆情引导机制研究   \n",
       "39          40                                大数据背景下的农业农村新闻创作导向分析   \n",
       "40          41                                  大数据技术对新闻传播领域的影响分析   \n",
       "41          42                                     《科技日报》人工智能报道研究   \n",
       "42          43                                  人工智能媒介下文学创作商业走向分析   \n",
       "43          44                           浅谈大数据时代新媒体与传统媒体间的冲击及融合路径   \n",
       "44          45          新媒体时代大数据技术对高校教育管理的影响——评《大数据时代高等教育规范化管理研究》   \n",
       "45          46                                    大数据语境下教育者话语权的重构   \n",
       "46          47                                 大数据技术下新媒体用户画像与隐私安全   \n",
       "47          48                      浅谈人工智能在新闻生产领域的应用——以中央广播电视总台为例   \n",
       "48          49                                基于大数据的广告公司新媒体营销策略研究   \n",
       "49          50               疫情下国际性会议融媒体传播机制——以2020世界人工智能大会云端峰会为例   \n",
       "\n",
       "                  作者                                                 刊名  \\\n",
       "0                柳海燕                                           中学政治教学参考   \n",
       "1                 于佳                                               中国报业   \n",
       "2                陈媛媛                                               记者摇篮   \n",
       "3   焦骞; 刘卓; 董军杰; 张爱净                                               传媒论坛   \n",
       "4                翟屿潼                                                学理论   \n",
       "5                 巩晗                                               青年记者   \n",
       "6       郭慧; 陈红梅; 阎瑞华                                           山西财经大学学报   \n",
       "7                许洋洋                                           自动化技术与应用   \n",
       "8                 杜菁                                              新闻爱好者   \n",
       "9                胡未央                                               中国报业   \n",
       "10               侯玉娟                                             广播电视信息   \n",
       "11               芦丽丽                                             现代电视技术   \n",
       "12           宋凯; 庞雪芮                                    湖南工业大学学报(社会科学版)   \n",
       "13               薛锦瑜                                               记者摇篮   \n",
       "14            张建; 高尚                                           绵阳师范学院学报   \n",
       "15                林珺                                               传媒论坛   \n",
       "16                孔洁                                               兰台世界   \n",
       "17         Wang Fang              Journal of Physics: Conference Series   \n",
       "18      朱茂富; 孙琳; 高国瑞                                           内蒙古科技与经济   \n",
       "19            黄璞; 李岩                                              办公室业务   \n",
       "20               郭晓文                                      赤峰学院学报(自然科学版)   \n",
       "21               涂玉洁                                               中国报业   \n",
       "22          吕颖迪; 于孟晨                                               商业文化   \n",
       "23          Xin Zhou  International Journal of Education and Teachin...   \n",
       "24          王梦颖; 李怀苍                                           宁夏师范学院学报   \n",
       "25                陈戈                                             现代电视技术   \n",
       "26       Zhu Zhixuan              Journal of Physics: Conference Series   \n",
       "27           张正; 杨会朴                                               文化产业   \n",
       "28            陈坤; 李佳                                           思想政治教育研究   \n",
       "29     祝智敏; 李晓雨; 吴振宇                                             中国传媒科技   \n",
       "30               田新梅                                             中国有线电视   \n",
       "31               王大维                                             中国科技论文   \n",
       "32                王谦                                             中国科技论文   \n",
       "33                陆璐                                             滁州学院学报   \n",
       "34                张敏                                             新闻研究导刊   \n",
       "35                孙芳                                               传媒论坛   \n",
       "36               杨文惠                                               传媒论坛   \n",
       "37           黄猛猛; 黄瑶                                             西部广播电视   \n",
       "38      陈娟; 康秀平; 许莹莹                                               声屏世界   \n",
       "39               唐雪莲                                               声屏世界   \n",
       "40               李朝敏                                               传媒论坛   \n",
       "41          赖晨璐; 陶贤都                                               科技传播   \n",
       "42                李娜                                               中国报业   \n",
       "43               严伊琳                                             中国传媒科技   \n",
       "44                王晶                                             中国科技论文   \n",
       "45      陈曦; 张晓世; 王雁冰                                            教育理论与实践   \n",
       "46               曹秦雨                                             新闻研究导刊   \n",
       "47                罗娜                                               新闻战线   \n",
       "48      郭慧馨; 葛健; 孟凡哲                                               商业经济   \n",
       "49                陈实                                               青年记者   \n",
       "\n",
       "                发表时间   被引     下载   操作  \n",
       "0         2021-05-25  NaN   63.0   下载  \n",
       "1         2021-05-25  NaN   54.0   下载  \n",
       "2         2021-05-15  NaN    NaN   下载  \n",
       "3         2021-05-10  NaN   34.0   下载  \n",
       "4         2021-05-05  NaN  107.0   下载  \n",
       "5         2021-04-30  NaN   56.0   下载  \n",
       "6   2021-04-26 10:35  NaN   68.0   下载  \n",
       "7         2021-04-25  NaN  107.0   下载  \n",
       "8         2021-04-20  NaN  164.0   下载  \n",
       "9         2021-04-15  NaN   36.0   下载  \n",
       "10        2021-04-15  NaN   96.0   下载  \n",
       "11        2021-04-15  NaN   20.0   下载  \n",
       "12        2021-04-15  NaN   14.0   下载  \n",
       "13        2021-04-15  NaN  145.0   下载  \n",
       "14        2021-04-15  NaN   93.0   下载  \n",
       "15        2021-04-08  NaN  144.0   下载  \n",
       "16        2021-04-06  NaN   28.0   下载  \n",
       "17        2021-04-01  NaN    NaN  NaN  \n",
       "18        2021-03-31  NaN   18.0   下载  \n",
       "19        2021-03-25  NaN   67.0   下载  \n",
       "20        2021-03-25  NaN   23.0   下载  \n",
       "21        2021-03-25  NaN   42.0   下载  \n",
       "22        2021-03-25  NaN   24.0   下载  \n",
       "23        2021-03-20  NaN    NaN  NaN  \n",
       "24        2021-03-15  NaN   55.0   下载  \n",
       "25        2021-03-15  NaN   40.0   下载  \n",
       "26        2021-03-01  NaN    NaN  NaN  \n",
       "27        2021-02-28  NaN   97.0   下载  \n",
       "28        2021-02-20  NaN  420.0   下载  \n",
       "29        2021-02-15  NaN   27.0   下载  \n",
       "30        2021-02-15  NaN   92.0   下载  \n",
       "31        2021-02-15  NaN  103.0   下载  \n",
       "32        2021-02-15  NaN  139.0   下载  \n",
       "33        2021-02-15  NaN   36.0   下载  \n",
       "34        2021-02-10  NaN   25.0   下载  \n",
       "35        2021-02-07  NaN  130.0   下载  \n",
       "36        2021-02-07  NaN  138.0   下载  \n",
       "37        2021-02-05  NaN   12.0   下载  \n",
       "38        2021-02-05  NaN   24.0   下载  \n",
       "39        2021-02-05  NaN    5.0   下载  \n",
       "40        2021-01-25  NaN  151.0   下载  \n",
       "41        2021-01-25  NaN   38.0   下载  \n",
       "42        2021-01-22  NaN   79.0   下载  \n",
       "43        2021-01-15  2.0   76.0   下载  \n",
       "44        2021-01-15  NaN   86.0   下载  \n",
       "45        2021-01-10  NaN   54.0   下载  \n",
       "46        2020-12-25  NaN  682.0   下载  \n",
       "47        2020-12-23  NaN   20.0   下载  \n",
       "48        2020-12-20  NaN  617.0   下载  \n",
       "49        2020-12-20  NaN   83.0   下载  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 首页页面信息\n",
    "element = driver.find_element_by_id('gridTable')\n",
    "page_html = element.get_attribute('innerHTML')\n",
    "首页页面数据 = pd.read_html(page_html)[0]\n",
    "首页页面数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 翻页"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 异常处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://kns.cnki.net/KNS8/Brief/VerifyCode?t=a8f397de-0795-4d94-9454-75dfd8e7269c'"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# image_url = driver.find_element_by_id('changeVercode').get_attribute('src')\n",
    "# image_url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# def baidu_API_OCR(image_url):\n",
    "#     # 1.获取百度token\n",
    "#     # client_id 为官网获取的AK， client_secret 为官网获取的SK\n",
    "#     host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=rGE4dOG8q8p5G6C6EiQRZPLl&client_secret=LMIbr14LgDwHaiwIomw6S2A3bCz8ErdV'\n",
    "#     response = requests.get(host)\n",
    "#     if response:\n",
    "#         access_token = response.json()[\"access_token\"]\n",
    "#     # 2.请求图片的验证信息\n",
    "#     request_url = \"https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic\"\n",
    "#     params = {\n",
    "#         \"url\":image_url\n",
    "#     }\n",
    "#     request_url = request_url + \"?access_token=\" + access_token\n",
    "#     headers = {'content-type': 'application/x-www-form-urlencoded'}\n",
    "#     response = requests.post(request_url, data=params, headers=headers)\n",
    "#     results = response.json()[\"words_result\"][0][\"words\"]\n",
    "#     return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import requests \n",
    "\n",
    "# for page in range(0,120):\n",
    "#     if driver.find_element_by_id('changeVercode'):\n",
    "#         # 解决验证码\n",
    "#         # 调用一个解决验证码的方法func 返回一个结果result 就是vercode\n",
    "#         image_url = driver.find_element_by_id('changeVercode').get_attribute('src')\n",
    "#         vercode = baidu_API_OCR(image_url)\n",
    "#         driver.find_element_by_id('vericode').clear\n",
    "#         driver.find_element_by_id('vericode').send_keys(vercode)\n",
    "#         driver.find_element_by_id('checkCodeBtn').click\n",
    "        \n",
    "#     else:\n",
    "#         driver.find_element_by_xpath('//*[@id=\"PageNext\"]').click()\n",
    "#         time.sleep(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 下一页\n",
    "element = driver.find_element_by_id('PageNext')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "表格_html = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]\n"
     ]
    }
   ],
   "source": [
    "# 所有页数\n",
    "pages = list(range(1,21))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 翻页\n",
    "def process_pages(pages):\n",
    "    for p in pages:\n",
    "        print(p,end=\"\\t\")\n",
    "        # 点击下一页进行跳转\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        # 设置休息时间，避免报错和验证码\n",
    "        time.sleep(20+10*random())\n",
    "        # 获取含有页面数据的表格\n",
    "        element = driver.find_element_by_id('gridTable')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        表格_html[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "2   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "3   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "4   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "5   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "6   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "7   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "8   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "9   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "10  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "11  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "12  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "13  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "14  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "15  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "16  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "17  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "18  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "19  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "20  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ..."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df = pd.DataFrame([表格_html]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "网站 = \"中国知网\"\n",
    "# 指定内容输出位置\n",
    "fn = {\"output\":{\"htm_snippets\": \"data_raw_src/知网_htm_snippets_{网站}.tsv\"}\n",
    "     }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 保存页面内容的csv文件\n",
    "filename = fn [\"output\"] [\"htm_snippets\"] \n",
    "df.to_csv(filename.format(网站=网站), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "l_df = []\n",
    "for p in pages:\n",
    "    表格 = pd.read_html(表格_html[p])[0]\n",
    "    l_df.append(表格)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>大数据时代思想政治教育“微”透视</td>\n",
       "      <td>柳海燕</td>\n",
       "      <td>中学政治教学参考</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>63.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>大数据背景下传统媒体突围策略分析</td>\n",
       "      <td>于佳</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>54.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>探讨大数据下的广电新闻编辑发展创新</td>\n",
       "      <td>陈媛媛</td>\n",
       "      <td>记者摇篮</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>大数据与新媒体时代医学期刊的办刊之路</td>\n",
       "      <td>焦骞; 刘卓; 董军杰; 张爱净</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>大数据时代与校园文化的多元性</td>\n",
       "      <td>翟屿潼</td>\n",
       "      <td>学理论</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>107.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>968</th>\n",
       "      <td>1019</td>\n",
       "      <td>媒体的用户关系管理应建立基于大数据的管理理念</td>\n",
       "      <td>陈娟</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2013-01-08</td>\n",
       "      <td>6.0</td>\n",
       "      <td>344.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>969</th>\n",
       "      <td>1020</td>\n",
       "      <td>直击趋势 大数据开启营销新未来——“2012中国创新营销峰会”彰显营销魅力</td>\n",
       "      <td>NaN</td>\n",
       "      <td>成功营销</td>\n",
       "      <td>2012-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>501.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>970</th>\n",
       "      <td>1021</td>\n",
       "      <td>当新媒体遇到“大数据”</td>\n",
       "      <td>高鹏</td>\n",
       "      <td>广播与电视技术</td>\n",
       "      <td>2012-10-15</td>\n",
       "      <td>23.0</td>\n",
       "      <td>2281.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>971</th>\n",
       "      <td>1022</td>\n",
       "      <td>\"Giornalista settore scienza. Requisiti richie...</td>\n",
       "      <td>Nico Pitrelli</td>\n",
       "      <td>S&amp;F_scienzaefilosofia.it</td>\n",
       "      <td>2009-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>972</th>\n",
       "      <td>1023</td>\n",
       "      <td>From kino-eye to anime -eye/ ai : the filmed a...</td>\n",
       "      <td>Mark Driscoll</td>\n",
       "      <td>Japan Forum</td>\n",
       "      <td>2002-09-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1023 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                                 篇名  \\\n",
       "0             1                                   大数据时代思想政治教育“微”透视   \n",
       "1             2                                   大数据背景下传统媒体突围策略分析   \n",
       "2             3                                  探讨大数据下的广电新闻编辑发展创新   \n",
       "3             4                                 大数据与新媒体时代医学期刊的办刊之路   \n",
       "4             5                                     大数据时代与校园文化的多元性   \n",
       "..          ...                                                ...   \n",
       "968        1019                             媒体的用户关系管理应建立基于大数据的管理理念   \n",
       "969        1020              直击趋势 大数据开启营销新未来——“2012中国创新营销峰会”彰显营销魅力   \n",
       "970        1021                                        当新媒体遇到“大数据”   \n",
       "971        1022  \"Giornalista settore scienza. Requisiti richie...   \n",
       "972        1023  From kino-eye to anime -eye/ ai : the filmed a...   \n",
       "\n",
       "                   作者                        刊名        发表时间    被引      下载   操作  \n",
       "0                 柳海燕                  中学政治教学参考  2021-05-25   NaN    63.0   下载  \n",
       "1                  于佳                      中国报业  2021-05-25   NaN    54.0   下载  \n",
       "2                 陈媛媛                      记者摇篮  2021-05-15   NaN     NaN   下载  \n",
       "3    焦骞; 刘卓; 董军杰; 张爱净                      传媒论坛  2021-05-10   NaN    34.0   下载  \n",
       "4                 翟屿潼                       学理论  2021-05-05   NaN   107.0   下载  \n",
       "..                ...                       ...         ...   ...     ...  ...  \n",
       "968                陈娟                    中国传媒科技  2013-01-08   6.0   344.0   下载  \n",
       "969               NaN                      成功营销  2012-12-15   NaN   501.0   下载  \n",
       "970                高鹏                   广播与电视技术  2012-10-15  23.0  2281.0   下载  \n",
       "971     Nico Pitrelli  S&F_scienzaefilosofia.it  2009-06-15   NaN     NaN  NaN  \n",
       "972     Mark Driscoll               Japan Forum  2002-09-01   NaN     NaN  NaN  \n",
       "\n",
       "[1023 rows x 8 columns]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_总表格 = 首页页面数据.append(df_url_out)\n",
    "df_总表格"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>大数据时代思想政治教育“微”透视</td>\n",
       "      <td>柳海燕</td>\n",
       "      <td>中学政治教学参考</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>63.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>大数据背景下传统媒体突围策略分析</td>\n",
       "      <td>于佳</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>54.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>探讨大数据下的广电新闻编辑发展创新</td>\n",
       "      <td>陈媛媛</td>\n",
       "      <td>记者摇篮</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>大数据与新媒体时代医学期刊的办刊之路</td>\n",
       "      <td>焦骞; 刘卓; 董军杰; 张爱净</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>大数据时代与校园文化的多元性</td>\n",
       "      <td>翟屿潼</td>\n",
       "      <td>学理论</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>107.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>968</th>\n",
       "      <td>1019</td>\n",
       "      <td>媒体的用户关系管理应建立基于大数据的管理理念</td>\n",
       "      <td>陈娟</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2013-01-08</td>\n",
       "      <td>6.0</td>\n",
       "      <td>344.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>969</th>\n",
       "      <td>1020</td>\n",
       "      <td>直击趋势 大数据开启营销新未来——“2012中国创新营销峰会”彰显营销魅力</td>\n",
       "      <td>NaN</td>\n",
       "      <td>成功营销</td>\n",
       "      <td>2012-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>501.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>970</th>\n",
       "      <td>1021</td>\n",
       "      <td>当新媒体遇到“大数据”</td>\n",
       "      <td>高鹏</td>\n",
       "      <td>广播与电视技术</td>\n",
       "      <td>2012-10-15</td>\n",
       "      <td>23.0</td>\n",
       "      <td>2281.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>971</th>\n",
       "      <td>1022</td>\n",
       "      <td>\"Giornalista settore scienza. Requisiti richie...</td>\n",
       "      <td>Nico Pitrelli</td>\n",
       "      <td>S&amp;F_scienzaefilosofia.it</td>\n",
       "      <td>2009-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>972</th>\n",
       "      <td>1023</td>\n",
       "      <td>From kino-eye to anime -eye/ ai : the filmed a...</td>\n",
       "      <td>Mark Driscoll</td>\n",
       "      <td>Japan Forum</td>\n",
       "      <td>2002-09-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1023 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                                 篇名  \\\n",
       "0             1                                   大数据时代思想政治教育“微”透视   \n",
       "1             2                                   大数据背景下传统媒体突围策略分析   \n",
       "2             3                                  探讨大数据下的广电新闻编辑发展创新   \n",
       "3             4                                 大数据与新媒体时代医学期刊的办刊之路   \n",
       "4             5                                     大数据时代与校园文化的多元性   \n",
       "..          ...                                                ...   \n",
       "968        1019                             媒体的用户关系管理应建立基于大数据的管理理念   \n",
       "969        1020              直击趋势 大数据开启营销新未来——“2012中国创新营销峰会”彰显营销魅力   \n",
       "970        1021                                        当新媒体遇到“大数据”   \n",
       "971        1022  \"Giornalista settore scienza. Requisiti richie...   \n",
       "972        1023  From kino-eye to anime -eye/ ai : the filmed a...   \n",
       "\n",
       "                   作者                        刊名        发表时间    被引      下载   操作  \n",
       "0                 柳海燕                  中学政治教学参考  2021-05-25   NaN    63.0   下载  \n",
       "1                  于佳                      中国报业  2021-05-25   NaN    54.0   下载  \n",
       "2                 陈媛媛                      记者摇篮  2021-05-15   NaN     NaN   下载  \n",
       "3    焦骞; 刘卓; 董军杰; 张爱净                      传媒论坛  2021-05-10   NaN    34.0   下载  \n",
       "4                 翟屿潼                       学理论  2021-05-05   NaN   107.0   下载  \n",
       "..                ...                       ...         ...   ...     ...  ...  \n",
       "968                陈娟                    中国传媒科技  2013-01-08   6.0   344.0   下载  \n",
       "969               NaN                      成功营销  2012-12-15   NaN   501.0   下载  \n",
       "970                高鹏                   广播与电视技术  2012-10-15  23.0  2281.0   下载  \n",
       "971     Nico Pitrelli  S&F_scienzaefilosofia.it  2009-06-15   NaN     NaN  NaN  \n",
       "972     Mark Driscoll               Japan Forum  2002-09-01   NaN     NaN  NaN  \n",
       "\n",
       "[1023 rows x 8 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 将表格存在本地\n",
    "with pd.ExcelWriter('Selenium知网数据.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_总表.to_excel(writer,sheet_name=\"知网数据\")\n",
    "display(df_总表格)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导出refworks（.txt）文件和下载原文"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 每次全选不超过500篇\n",
    "# 设置休息时间避免出现报错和验证码；若出现使用API进行处理\n",
    "# 页面跳转问题一定要切换对应的窗口进行操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 翻页后回到首页\n",
    "element = driver.find_element_by_id('total').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "导出文件_html = dict()\n",
    "main_content_ =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n"
     ]
    }
   ],
   "source": [
    "# 循环：点击全选-跳转下一页 每次不超过500篇所以分两次进行导出下载\n",
    "\n",
    "# 第1次\n",
    "pages = list(range(1,11))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_choose(pages):\n",
    "    for p in pages:\n",
    "        print(p,end=\"\\t\")\n",
    "        # 点击全选\n",
    "        全选 = driver.find_element_by_id('selectCheckAll1')\n",
    "        全选.click()\n",
    "        # 点击跳转下一页\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        # 设置休息时间，避免报错和验证码\n",
    "        time.sleep(20+10*random())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t"
     ]
    }
   ],
   "source": [
    "process_choose(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击 导出与分析\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-d\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击 导出文献\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-r\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击 Refworks\n",
    "element = driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-02C6B60B795A8EA53531322C910B05D7',\n",
       " 'CDwindow-DD1A0A4C932E87C22B9ACD11A505260D',\n",
       " 'CDwindow-62B97979A112FA24FD66CF8666651196']"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 所有窗口ID\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-52-889a0c377e4b>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[2])\n"
     ]
    }
   ],
   "source": [
    "# 切换窗口\n",
    "driver.switch_to_window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出.txt文件\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-54-da0490c03848>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 返回文献列表页窗口\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击 批量下载\n",
    "element = driver.find_element_by_xpath('//li[@class=\"bulkdownload export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-56-f9282ec4a786>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[3])\n"
     ]
    }
   ],
   "source": [
    "# 切换窗口\n",
    "driver.switch_to_window(driver.window_handles[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击 批量下载已选500篇文献\n",
    "element = driver.find_element_by_id('btn-download-all').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-58-25c5f7b5163b>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 第2次\n",
    "\n",
    "# 先返回文献列表页窗口\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 清除已选的500篇文献\n",
    "element = driver.find_element_by_xpath('//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]\n"
     ]
    }
   ],
   "source": [
    "pages = list(range(10,20))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t"
     ]
    }
   ],
   "source": [
    "process_choose(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击 导出与分析\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-d\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击 导出文献\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-r\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击 Refworks\n",
    "element = driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-02C6B60B795A8EA53531322C910B05D7',\n",
       " 'CDwindow-DD1A0A4C932E87C22B9ACD11A505260D',\n",
       " 'CDwindow-62B97979A112FA24FD66CF8666651196',\n",
       " 'CDwindow-9C5B441ACF0DDA6550BDAF223DE0ECFF',\n",
       " 'CDwindow-CF985F5F21A9ED62956DFACE5AAA3222']"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 所有窗口ID\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-66-e99823b384da>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[4])\n"
     ]
    }
   ],
   "source": [
    "# 切换窗口\n",
    "driver.switch_to_window(driver.window_handles[4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出.txt文件\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-68-25c5f7b5163b>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 返回文献列表页窗口\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击 批量下载\n",
    "element = driver.find_element_by_xpath('//li[@class=\"bulkdownload export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-70-0046eff88fda>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[5])\n"
     ]
    }
   ],
   "source": [
    "# 切换窗口\n",
    "driver.switch_to_window(driver.window_handles[5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击 批量下载已选500篇文献\n",
    "element = driver.find_element_by_id('btn-download-all').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
